library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(harrietr)
## Registered S3 method overwritten by 'treeio':
## method from
## root.phylo ape
library(here)
## here() starts at /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github
library(ggtree)
## ggtree v2.0.1 For help: https://yulab-smu.github.io/treedata-book/
##
## If you use ggtree in published research, please cite the most appropriate paper(s):
##
## [36m-[39m Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution 2018, 35(12):3041-3043. doi: 10.1093/molbev/msy194
## [36m-[39m Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:magrittr':
##
## inset
## The following object is masked from 'package:tidyr':
##
## expand
library(phytools)
## Loading required package: ape
##
## Attaching package: 'ape'
## The following object is masked from 'package:ggtree':
##
## rotate
## The following object is masked from 'package:ggpubr':
##
## rotate
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /Users/giulieris/OneDrive - The University of Melbourne/R/VANANZ_phenotypes_github"
rm(list = ls())
snippy_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_pair_id_snps.mask.tab") %>%
arrange(PAIR_ID)
## Parsed with column specification:
## cols(
## PAIR_ID = col_character(),
## REFERENCE = col_character(),
## ISOLATE = col_character(),
## CHROM = col_character(),
## POS = col_double(),
## TYPE = col_character(),
## REF = col_character(),
## ALT = col_character(),
## EVIDENCE = col_character(),
## FTYPE = col_character(),
## STRAND = col_character(),
## NT_POS = col_character(),
## AA_POS = col_character(),
## EFFECT = col_character(),
## LOCUS_TAG = col_character(),
## GENE = col_character(),
## PRODUCT = col_character()
## )
snippy_data
## # A tibble: 142,030 x 17
## PAIR_ID REFERENCE ISOLATE CHROM POS TYPE REF ALT EVIDENCE FTYPE STRAND
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 GP-0002 BPH2705.… BPH2704 cont… 32669 snp T A A:23 T:0 <NA> <NA>
## 2 GP-0002 BPH2705.… BPH2704 cont… 14 snp G T T:29 G:0 <NA> <NA>
## 3 GP-0002 BPH2705.… BPH2704 cont… 521 snp C A A:23 C:0 <NA> <NA>
## 4 GP-0002 BPH2705.… BPH2704 cont… 389 snp T A A:18 T:0 <NA> <NA>
## 5 GP-0002 BPH2705.… BPH2704 cont… 61262 snp A C C:25 A:0 CDS +
## 6 GP-0002 BPH2705.… BPH2704 cont… 59956 snp C A A:21 C:0 CDS +
## 7 GP-0002 BPH2705.… BPH2704 cont… 38034 snp C A A:41 C:0 CDS +
## 8 GP-0002 BPH2705.… BPH2704 cont… 37962 snp A C C:38 A:0 CDS +
## 9 GP-0002 BPH2705.… BPH2704 cont… 2728 snp A T T:18 A:0 <NA> <NA>
## 10 GP-0002 BPH2705.… BPH2704 cont… 2695 snp C A A:20 C:0 <NA> <NA>
## # … with 142,020 more rows, and 6 more variables: NT_POS <chr>, AA_POS <chr>,
## # EFFECT <chr>, LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>
# modify snippy output:
# 1) generate iso1 and iso2 for merging with the phenotypic analysis
# 2) modify CHROM (contig name) to be consistent with the labelling in bed files down the track
# 3) extract mutation effects for easier interpretation
source("../Functions/aa_convert.R")
snippy_data_modified <- snippy_data %>%
mutate(iso1 = str_remove(REFERENCE, ".gbk"),
iso2 = ISOLATE,
CHROM = str_c(iso1, "_", CHROM)) %>%
separate(EFFECT,
into = c("EFFTYPE", "NUCLEOTIDE_CHANGE", "MUTATION"),
sep = "\\s",
remove = T,
extra = "merge") %>%
mutate(NUCLEOTIDE_CHANGE = str_remove(NUCLEOTIDE_CHANGE, "c."),
MUTATION = str_remove(MUTATION, "p."),
MUTATION_SHORT = aa_convert(MUTATION)) %>%
separate(AA_POS,
into = c("AA_POS", "AA_LENGTH"),
sep = "/",
remove = T) %>%
mutate_at(vars(starts_with("AA")),
as.numeric)
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 488 rows [458,
## 459, 554, 555, 940, 941, 942, 943, 944, 994, 995, 996, 997, 998, 1048, 1049,
## 1050, 1051, 1052, 5899, ...].
## Parsed with column specification:
## cols(
## `Amino acid` = col_character(),
## `Three letter code` = col_character(),
## `One letter code` = col_character()
## )
snippy_data_modified
## # A tibble: 142,030 x 23
## PAIR_ID REFERENCE ISOLATE CHROM POS TYPE REF ALT EVIDENCE FTYPE STRAND
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp T A A:23 T:0 <NA> <NA>
## 2 GP-0002 BPH2705.… BPH2704 BPH2… 14 snp G T T:29 G:0 <NA> <NA>
## 3 GP-0002 BPH2705.… BPH2704 BPH2… 521 snp C A A:23 C:0 <NA> <NA>
## 4 GP-0002 BPH2705.… BPH2704 BPH2… 389 snp T A A:18 T:0 <NA> <NA>
## 5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp A C C:25 A:0 CDS +
## 6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp C A A:21 C:0 CDS +
## 7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp C A A:41 C:0 CDS +
## 8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp A C C:38 A:0 CDS +
## 9 GP-0002 BPH2705.… BPH2704 BPH2… 2728 snp A T T:18 A:0 <NA> <NA>
## 10 GP-0002 BPH2705.… BPH2704 BPH2… 2695 snp C A A:20 C:0 <NA> <NA>
## # … with 142,020 more rows, and 12 more variables: NT_POS <chr>, AA_POS <dbl>,
## # AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## # LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## # MUTATION_SHORT <chr>
snippy_data_modified %>%
.$ISOLATE %>%
n_distinct()
## [1] 253
df_n_mutations <- snippy_data_modified %>%
count(PAIR_ID, sort = T)
df_n_mutations %>%
ggplot(aes(x = n)) +
# geom_histogram() +
geom_density() +
theme_bw()
distant_pairs <- df_n_mutations %>%
filter(n > 100) %>%
.$PAIR_ID
snippy_data_modified_no_distant_pairs <- snippy_data_modified %>%
filter(!PAIR_ID %in% distant_pairs)
rm(distant_pairs, df_n_mutations)
protein_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.cd-hit.tab") %>%
rename_all(funs(str_c(., "_prot")))
## Parsed with column specification:
## cols(
## id = col_character(),
## clstr = col_double(),
## clstr_size = col_double(),
## length = col_double(),
## clstr_rep = col_double(),
## clstr_iden = col_character(),
## clstr_cov = col_character()
## )
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
nebraska_clusters <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/representative_proteins_FPR3757_cd-hit.tab") %>%
mutate(source = if_else(str_detect(id, "SAUSA300"), "FPR3757", "mutated_proteins")) %>%
group_by(clstr) %>%
filter(any(str_detect(id, "BPH"))) %>%
group_by(clstr, source) %>%
arrange(desc(clstr_cov, clstr_id)) %>%
filter(!(source == "FPR3757" & row_number() > 1)) %>%
mutate(source_long = str_c(source, "_", row_number())) %>%
ungroup() %>%
select(id, clstr, source_long) %>%
pivot_wider(names_from = source_long, values_from = id) %>%
pivot_longer(cols = starts_with("mutated_proteins"), names_to = "source_long", values_to = "id_prot") %>%
drop_na(id_prot) %>%
select(id_prot, nebraska_locus_tag = FPR3757_1)
## Parsed with column specification:
## cols(
## id = col_character(),
## clstr = col_double(),
## clstr_size = col_double(),
## length = col_double(),
## clstr_rep = col_double(),
## clstr_iden = col_character(),
## clstr_cov = col_character()
## )
df_neb <- read_csv("Ideas_Grant_2020_analysis/Raw_data/nebraska_all_proteins.csv") %>%
rename(neb_mutant_id = `Strain Name`, neb_gene = `Gene name`, neb_product = `gene discription`)
## Parsed with column specification:
## cols(
## `Strain Name` = col_character(),
## plate384 = col_double(),
## Well384 = col_character(),
## `Gene name` = col_character(),
## `gene discription` = col_character(),
## nebraska_locus_tag = col_character(),
## Row384 = col_character(),
## Column384 = col_double(),
## nebraska_aa_seq = col_character()
## )
protein_clusters_data <- protein_clusters_data %>%
left_join(nebraska_clusters) %>%
group_by(clstr_prot) %>%
mutate(nebraska_locus_tag = nebraska_locus_tag[which(clstr_rep_prot == 1)]) %>%
left_join(df_neb)
## Joining, by = "id_prot"
## Joining, by = "nebraska_locus_tag"
protein_seq_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/mutated_proteins.tab") %>%
rename_all(funs(str_c(., "_prot")))
## Parsed with column specification:
## cols(
## FASTA_ID = col_character(),
## SEQUENCE = col_character()
## )
snippy_data_modified_proteins <- snippy_data_modified_no_distant_pairs %>%
left_join(protein_clusters_data,
by = c("LOCUS_TAG" = "id_prot")) %>%
left_join(protein_seq_data,
by = c("LOCUS_TAG" = "FASTA_ID_prot"))
snippy_data_modified_proteins
## # A tibble: 104,035 x 39
## PAIR_ID REFERENCE ISOLATE CHROM POS TYPE REF ALT EVIDENCE FTYPE STRAND
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 GP-0002 BPH2705.… BPH2704 BPH2… 32669 snp T A A:23 T:0 <NA> <NA>
## 2 GP-0002 BPH2705.… BPH2704 BPH2… 14 snp G T T:29 G:0 <NA> <NA>
## 3 GP-0002 BPH2705.… BPH2704 BPH2… 521 snp C A A:23 C:0 <NA> <NA>
## 4 GP-0002 BPH2705.… BPH2704 BPH2… 389 snp T A A:18 T:0 <NA> <NA>
## 5 GP-0002 BPH2705.… BPH2704 BPH2… 61262 snp A C C:25 A:0 CDS +
## 6 GP-0002 BPH2705.… BPH2704 BPH2… 59956 snp C A A:21 C:0 CDS +
## 7 GP-0002 BPH2705.… BPH2704 BPH2… 38034 snp C A A:41 C:0 CDS +
## 8 GP-0002 BPH2705.… BPH2704 BPH2… 37962 snp A C C:38 A:0 CDS +
## 9 GP-0002 BPH2705.… BPH2704 BPH2… 2728 snp A T T:18 A:0 <NA> <NA>
## 10 GP-0002 BPH2705.… BPH2704 BPH2… 2695 snp C A A:20 C:0 <NA> <NA>
## # … with 104,025 more rows, and 28 more variables: NT_POS <chr>, AA_POS <dbl>,
## # AA_LENGTH <dbl>, EFFTYPE <chr>, NUCLEOTIDE_CHANGE <chr>, MUTATION <chr>,
## # LOCUS_TAG <chr>, GENE <chr>, PRODUCT <chr>, iso1 <chr>, iso2 <chr>,
## # MUTATION_SHORT <chr>, clstr_prot <dbl>, clstr_size_prot <dbl>,
## # length_prot <dbl>, clstr_rep_prot <dbl>, clstr_iden_prot <chr>,
## # clstr_cov_prot <chr>, nebraska_locus_tag <chr>, neb_mutant_id <chr>,
## # plate384 <dbl>, Well384 <chr>, neb_gene <chr>, neb_product <chr>,
## # Row384 <chr>, Column384 <dbl>, nebraska_aa_seq <chr>, SEQUENCE_prot <chr>
col_names <- c("CHROM_intergenic",
"START",
"END",
"CHROM_protein",
"SOURCE",
"TYPE",
"START_flank_prot",
"END_flank_prot",
"SCORE",
"STRAND_flank_prot",
"PHASE",
"ATTRIBUTES_flank_prot",
"CHROM_mutation",
"POS_minus1",
"POS",
"PAIR_ID",
"REFERENCE",
"ISOLATE")
intergenic_regions_annotated <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.annotated.bed",
col_names = col_names) %>%
select(CHROM = CHROM_intergenic,
START,
END,
START_flank_prot,
END_flank_prot,
STRAND_flank_prot,
ATTRIBUTES_flank_prot,
POS,
PAIR_ID,
REFERENCE,
ISOLATE)
## Parsed with column specification:
## cols(
## CHROM_intergenic = col_character(),
## START = col_double(),
## END = col_double(),
## CHROM_protein = col_character(),
## SOURCE = col_character(),
## TYPE = col_character(),
## START_flank_prot = col_double(),
## END_flank_prot = col_double(),
## SCORE = col_character(),
## STRAND_flank_prot = col_character(),
## PHASE = col_double(),
## ATTRIBUTES_flank_prot = col_character(),
## CHROM_mutation = col_character(),
## POS_minus1 = col_double(),
## POS = col_double(),
## PAIR_ID = col_character(),
## REFERENCE = col_character(),
## ISOLATE = col_character()
## )
upstream_proteins <- intergenic_regions_annotated %>%
group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
filter(END_flank_prot < POS) %>%
rename_at(vars(ends_with("_flank_prot")),
funs(str_c(., "_upstream")))
downstream_proteins <- intergenic_regions_annotated %>%
group_by(CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE) %>%
filter(START_flank_prot > POS) %>%
rename_at(vars(ends_with("_flank_prot")),
funs(str_c(., "_downstream")))
intergenic_regions_annotated <- upstream_proteins %>%
left_join(downstream_proteins)
## Joining, by = c("CHROM", "START", "END", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")
intergenic_regions_annotated
## # A tibble: 42,815 x 15
## # Groups: CHROM, START, END, POS, PAIR_ID, REFERENCE, ISOLATE [42,815]
## CHROM START END START_flank_pro… END_flank_prot_… STRAND_flank_pr…
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 BPH2… 213895 214187 212549 213895 +
## 2 BPH2… 162407 162758 162189 162407 -
## 3 BPH2… 162407 162758 162189 162407 -
## 4 BPH2… 162407 162758 162189 162407 -
## 5 BPH2… 174634 175871 174428 174634 +
## 6 BPH2… 174634 175871 174428 174634 +
## 7 BPH2… 178465 178709 178217 178465 +
## 8 BPH2… 181179 181441 180931 181179 +
## 9 BPH2… 231689 232169 230541 231689 +
## 10 BPH2… 98412 98874 97246 98412 -
## # … with 42,805 more rows, and 9 more variables:
## # ATTRIBUTES_flank_prot_upstream <chr>, POS <dbl>, PAIR_ID <chr>,
## # REFERENCE <chr>, ISOLATE <chr>, START_flank_prot_downstream <dbl>,
## # END_flank_prot_downstream <dbl>, STRAND_flank_prot_downstream <chr>,
## # ATTRIBUTES_flank_prot_downstream <chr>
rm(upstream_proteins, downstream_proteins)
intergenic_clusters_data <- read_tsv("Ideas_Grant_2020_analysis/Raw_data/all_mutated.intergenic.cd-hit.tab") %>%
separate(id, into = c("CHROM", "START", "END"), sep = "[:-]", remove = F) %>%
rename_at(vars(id, starts_with("clstr")),
funs(str_c(., "_intergenic"))) %>%
mutate_at(vars(START, END), as.numeric) %>%
distinct()
## Parsed with column specification:
## cols(
## id = col_character(),
## clstr = col_double(),
## clstr_size = col_double(),
## length = col_double(),
## clstr_rep = col_double(),
## clstr_iden = col_character(),
## clstr_cov = col_character()
## )
snippy_data_modified_proteins_intergenic_regions <- snippy_data_modified_proteins %>%
left_join(intergenic_regions_annotated,
by = c("CHROM", "POS", "PAIR_ID", "REFERENCE", "ISOLATE")) %>%
left_join(intergenic_clusters_data)
## Joining, by = c("CHROM", "START", "END")
df_phenotypes <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/genetic_pairs_pheno_changes_mortality_switches.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso1 = col_character(),
## iso1_ST = col_character(),
## iso1_strain_group = col_character(),
## iso1_mortality = col_character(),
## iso1_included = col_logical(),
## iso1_scv = col_logical(),
## iso1_persister_type = col_character(),
## iso1_sample_type = col_character(),
## iso1_genes = col_character(),
## iso1_CC = col_character(),
## iso2_ST = col_character(),
## iso2_strain_group = col_character(),
## iso2_mortality = col_character(),
## iso2_included = col_logical(),
## iso2_scv = col_logical(),
## iso2_persister_type = col_character(),
## iso2_sample_type = col_character(),
## iso2_genes = col_character(),
## iso2_CC = col_character()
## # ... with 2 more columns
## )
## See spec(...) for full column specifications.
# check available phenotypes
# mortality
df_phenotypes %>%
select(iso1, iso2, switches) %>%
distinct() %>%
count(switches)
## # A tibble: 5 x 2
## switches n
## <chr> <int>
## 1 Died-Died 42
## 2 Died-Survived 20
## 3 Survived-Died 20
## 4 Survived-Survived 146
## 5 <NA> 6
# most mortality phenotypes are not available. Need to recalculate that
df_all_genetic_pairs_pheno <- read_csv("Ideas_Grant_2020_analysis/Genetic_pairs_table/df_all_genetic_pairs_pheno.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## iso2 = col_character(),
## iso1 = col_character(),
## iso1_ST = col_character(),
## iso1_strain_group = col_character(),
## iso1_mortality = col_character(),
## iso1_included = col_logical(),
## iso1_scv = col_logical(),
## iso1_persister_type = col_character(),
## iso1_sample_type = col_character(),
## iso1_genes = col_character(),
## iso1_CC = col_character(),
## iso2_ST = col_character(),
## iso2_strain_group = col_character(),
## iso2_mortality = col_character(),
## iso2_included = col_logical(),
## iso2_scv = col_logical(),
## iso2_persister_type = col_character(),
## iso2_sample_type = col_character(),
## iso2_genes = col_character(),
## iso2_CC = col_character()
## # ... with 1 more columns
## )
## See spec(...) for full column specifications.
df_all_genetic_pairs_pheno %>%
select(iso1, iso2, iso1_mortality, iso2_mortality) %>%
filter(is.na(iso1_mortality) | is.na(iso2_mortality))
## # A tibble: 90 x 4
## iso1 iso2 iso1_mortality iso2_mortality
## <chr> <chr> <chr> <chr>
## 1 BPH2751 BPH2750 Died <NA>
## 2 BPH2750 BPH2751 <NA> Died
## 3 BPH2888 BPH2887 Died <NA>
## 4 BPH2887 BPH2888 <NA> Died
## 5 BPH2898 BPH2897 Died <NA>
## 6 BPH2897 BPH2898 <NA> Died
## 7 BPH2966 BPH2965 Died <NA>
## 8 BPH2965 BPH2966 <NA> Died
## 9 BPH2988 BPH2987 Died <NA>
## 10 BPH2987 BPH2988 <NA> Died
## # … with 80 more rows
df_all_genetic_pairs_pheno_switches <- df_all_genetic_pairs_pheno %>%
mutate(switches = str_c(iso1_mortality, "-", iso2_mortality))
# count
df_all_genetic_pairs_pheno_switches %>%
select(iso1, iso2, switches) %>%
distinct() %>%
count(switches)
## # A tibble: 5 x 2
## switches n
## <chr> <int>
## 1 Died-Died 68
## 2 Died-Survived 316
## 3 Survived-Died 316
## 4 Survived-Survived 1694
## 5 <NA> 90
# df_mutations_phenotypes <- snippy_data_modified_proteins_intergenic_regions %>%
# left_join(df_phenotypes)
df_mortality_close_clusters <- df_all_genetic_pairs_pheno_switches %>%
arrange(iso2) %>%
filter(switches %in% c("Survived-Died")) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
select(iso1, iso2, switches, iso_cluster)
df_mortality_close_clusters %>%
.$iso_cluster %>%
n_distinct()
## [1] 28
df_mortality_close_clusters %>%
.$iso1 %>%
n_distinct()
## [1] 76
df_mortality_close_clusters %>%
ggplot(aes(x = fct_infreq(iso_cluster))) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
df_mortality_close_pairs <- df_all_genetic_pairs_pheno_switches %>%
arrange(iso2) %>%
filter(switches %in% c("Survived-Died")) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
select(iso1, iso2, switches, iso_cluster) %>%
arrange(iso2) %>%
mutate(same_iso2 = if_else(row_number() == 1, FALSE, lag(iso2) == iso2)) %>%
filter(!same_iso2) %>%
select(iso1, iso2, switches, iso_cluster) %>%
mutate(dataset = "pairs")
df_mortality_close_pairs %>%
.$iso_cluster %>%
n_distinct()
## [1] 28
iso1_pairs <- sort(df_mortality_close_pairs$iso1)
iso1_reserve <- df_all_genetic_pairs_pheno_switches %>%
arrange(iso2) %>%
filter(switches %in% c("Survived-Died")) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
select(iso1, iso2, switches, iso_cluster) %>%
filter(!iso1 %in% iso1_pairs) %>%
.$iso1 %>%
unique() %>%
sort()
df_mortality_close_clusters_reserve <- df_all_genetic_pairs_pheno_switches %>%
arrange(iso2) %>%
filter(switches %in% c("Survived-Died")) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
select(iso1, iso2, switches, iso_cluster) %>%
filter(iso1 %in% iso1_reserve) %>%
ungroup() %>%
arrange(iso1) %>%
distinct(iso1, .keep_all = T) %>%
mutate(dataset = "reserve")
df_mortality_close_clusters_2 <- bind_rows(df_mortality_close_pairs,
df_mortality_close_clusters_reserve)
df_mortality_close_clusters_2 %>%
.$iso_cluster %>%
n_distinct()
## [1] 28
df_mortality_close_clusters_2 %>%
.$iso1 %>%
n_distinct()
## [1] 76
df_mortality_close_clusters_2 %>%
ggplot(aes(x = fct_infreq(iso_cluster))) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
df_mutations_phenotypes_mortality_close_clusters <- snippy_data_modified_proteins %>%
right_join(df_mortality_close_clusters)
## Joining, by = c("iso1", "iso2")
df_convergence_mortality_clusters <- df_mutations_phenotypes_mortality_close_clusters %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n_clusters = n_distinct(iso_cluster),
n_pairs = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, switches, everything()) %>%
arrange(clstr_prot, desc(n_clusters))
t_mortality_clusters <- df_convergence_mortality_clusters %>%
group_by(clstr_prot) %>%
mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|")))%>%
mutate(phenotype = "SAB mortality") %>%
arrange(desc(n_clusters))
t_mortality_clusters
## # A tibble: 689 x 13
## # Groups: n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot,
## # nebraska_locus_tag, neb_mutant_id, neb_gene [689]
## n_clusters n_pairs clstr_prot clstr_size_prot SEQUENCE_prot nebraska_locus_…
## <int> <int> <dbl> <dbl> <chr> <chr>
## 1 13 25 7 73 MIMGNLRFQQEY… SAUSA300_0181
## 2 13 31 358 30 MLFHKKIESLIS… SAUSA300_2167
## 3 11 11 469 10 MKFSTLSEEEFT… SAUSA300_1141
## 4 11 13 1105 49 MSNQNYDYNKNE… SAUSA300_0754
## 5 11 17 569 20 MKIIHTADWHLG… SAUSA300_1242
## 6 11 22 211 25 MAKLLIMSIVSF… <NA>
## 7 11 54 140 56 MKFKSLITTTLA… <NA>
## 8 11 55 0 61 MNYRDKIQKFSI… SAUSA300_1327
## 9 10 17 65 27 MNMKKKEKHAIR… <NA>
## 10 10 26 435 66 MELLNRYNFVLF… SAUSA300_1991
## # … with 679 more rows, and 7 more variables: neb_mutant_id <chr>,
## # neb_gene <chr>, neb_product <chr>, GENE <chr>, PRODUCT <chr>,
## # MUTATION_SHORT <chr>, phenotype <chr>
df_phenotypes
## # A tibble: 234 x 111
## iso2 iso1 dist iso1_ST iso1_strain_gro… iso1_mecA iso1_mortality
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 BPH2… BPH2… 21 93 VAN-004 1 Survived
## 2 BPH2… BPH2… 21 93 VAN-004 1 Survived
## 3 BPH2… BPH2… 20 97 VAN-007 0 Survived
## 4 BPH2… BPH2… 20 97 VAN-007 0 Survived
## 5 BPH2… BPH2… 18 239 VAN-064 1 Survived
## 6 BPH2… BPH3… 16 239 VSS-287 1 Died
## 7 BPH2… BPH2… 20 239 VAN-155 1 Died
## 8 BPH2… BPH2… 29 45 VAN-011 0 Survived
## 9 BPH2… BPH2… 29 45 VAN-011 0 Survived
## 10 BPH2… BPH2… 18 239 VAN-082 1 Survived
## # … with 224 more rows, and 104 more variables: iso1_intrahost_index <dbl>,
## # iso1_included <lgl>, iso1_unrelated_to_index <dbl>, iso1_mut_count <dbl>,
## # iso1_scv <lgl>, iso1_vanco_difference <dbl>, iso1_persister_type <chr>,
## # iso1_intrahost_sampledelay <dbl>, iso1_sample_type <chr>, iso1_genes <chr>,
## # iso1_CC <chr>, iso2_ST <chr>, iso2_strain_group <chr>, iso2_mecA <dbl>,
## # iso2_mortality <chr>, iso2_intrahost_index <dbl>, iso2_included <lgl>,
## # iso2_unrelated_to_index <dbl>, iso2_mut_count <dbl>, iso2_scv <lgl>,
## # iso2_vanco_difference <dbl>, iso2_persister_type <chr>,
## # iso2_intrahost_sampledelay <dbl>, iso2_sample_type <chr>, iso2_genes <chr>,
## # iso2_CC <chr>, CC <chr>, iso1_time_of_max_rate_OD <dbl>,
## # iso1_max_rate_OD <dbl>, iso1_doubling_time_OD <dbl>, iso1_AUC_OD <dbl>,
## # iso1_time_of_max_OD <dbl>, iso1_max_OD <dbl>, iso1_time_of_min_OD <dbl>,
## # iso1_min_OD <dbl>, iso1_end_point_OD <dbl>,
## # iso1_end_point_timepoint_OD <dbl>, iso1_AUC_death <dbl>,
## # iso1_time_of_max_death <dbl>, iso1_max_death <dbl>,
## # iso1_time_of_min_death <dbl>, iso1_min_death <dbl>,
## # iso1_time_of_max_rate_death <dbl>, iso1_max_rate_death <dbl>,
## # iso1_doubling_time_death <dbl>, iso1_end_point_death <dbl>,
## # iso1_end_point_timepoint_death <dbl>, iso2_time_of_max_rate_OD <dbl>,
## # iso2_max_rate_OD <dbl>, iso2_doubling_time_OD <dbl>, iso2_AUC_OD <dbl>,
## # iso2_time_of_max_OD <dbl>, iso2_max_OD <dbl>, iso2_time_of_min_OD <dbl>,
## # iso2_min_OD <dbl>, iso2_end_point_OD <dbl>,
## # iso2_end_point_timepoint_OD <dbl>, iso2_AUC_death <dbl>,
## # iso2_time_of_max_death <dbl>, iso2_max_death <dbl>,
## # iso2_time_of_min_death <dbl>, iso2_min_death <dbl>,
## # iso2_time_of_max_rate_death <dbl>, iso2_max_rate_death <dbl>,
## # iso2_doubling_time_death <dbl>, iso2_end_point_death <dbl>,
## # iso2_end_point_timepoint_death <dbl>, delta_time_of_max_rate_OD <dbl>,
## # delta_max_rate_OD <dbl>, delta_doubling_time_OD <dbl>, delta_AUC_OD <dbl>,
## # delta_time_of_max_OD <dbl>, delta_time_of_min_OD <dbl>, delta_max_OD <dbl>,
## # delta_min_OD <dbl>, delta_end_point_OD <dbl>,
## # delta_time_of_max_rate_death <dbl>, delta_max_rate_death <dbl>,
## # delta_doubling_time_death <dbl>, delta_AUC_death <dbl>,
## # delta_time_of_max_death <dbl>, delta_time_of_min_death <dbl>,
## # delta_max_death <dbl>, delta_min_death <dbl>, delta_end_point_death <dbl>,
## # log2fc_time_of_max_rate_OD <dbl>, log2fc_max_rate_OD <dbl>,
## # log2fc_doubling_time_OD <dbl>, log2fc_AUC_OD <dbl>,
## # log2fc_time_of_max_OD <dbl>, log2fc_time_of_min_OD <dbl>,
## # log2fc_max_OD <dbl>, log2fc_min_OD <dbl>, log2fc_end_point_OD <dbl>,
## # log2fc_time_of_max_rate_death <dbl>, log2fc_max_rate_death <dbl>,
## # log2fc_doubling_time_death <dbl>, log2fc_AUC_death <dbl>,
## # log2fc_time_of_max_death <dbl>, log2fc_time_of_min_death <dbl>, …
for (var in str_subset(colnames(df_phenotypes), "iso1_.*_OD")){
p <- ggdensity(data = df_phenotypes, x = var, fill = "red") +
theme_bw()
print(p)
}
df_phenotypes %>%
ggplot(aes(x = abs(delta_AUC_OD))) +
geom_density(fill = "red") +
theme_bw()
df_plot <- df_phenotypes %>%
mutate_at(vars(matches("delta_.*_OD")),
abs)
for (var in str_subset(colnames(df_plot), "delta_.*_OD")){
p <- ggdensity(data = df_plot, x = var, fill = "red") +
theme_bw()
print(p)
}
Based on the exploration above, we define a delta AUC threshold of 30 for discordant pairs (60 would be a more stringent one)
# df_growth_close_clusters <- df_phenotypes %>%
# arrange(iso2) %>%
# filter(delta_AUC_OD < - 10) %>%
# group_by(iso2) %>%
# mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
# ungroup() %>%
# distinct(iso1, .keep_all =T) %>%
# select(iso1, iso2, iso_cluster, delta_AUC_OD, iso1_AUC_OD, iso2_AUC_OD)
df_growth_close_clusters <- df_phenotypes %>%
arrange(iso2) %>%
filter(delta_AUC_OD < -10) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster"),
n = n()) %>%
select(iso1, iso2, iso_cluster, n, delta_AUC_OD, iso1_AUC_OD, iso2_AUC_OD) %>%
arrange(iso1, n) %>%
group_by(iso1) %>%
mutate(keep = row_number() == 1) %>%
filter(keep)
df_growth_close_clusters %>%
.$iso_cluster %>%
n_distinct()
## [1] 22
df_growth_close_clusters %>%
ggplot(aes(x = fct_infreq(iso_cluster))) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
df_mutations_growth_close_clusters <- snippy_data_modified_proteins %>%
right_join(df_growth_close_clusters)
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_growth_close_clusters)
## [1] 1100
df_convergence_growth_clusters <- df_mutations_growth_close_clusters %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n_clusters = n_distinct(iso_cluster),
n_pairs = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_OD, everything()) %>%
arrange(clstr_prot, desc(n_clusters))
t_growth_clusters <- df_convergence_growth_clusters %>%
group_by(clstr_prot) %>%
mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|"))) %>%
mutate(phenotype = "Growth rate (AUC)") %>%
arrange(desc(n_clusters))
for (var in str_subset(colnames(df_phenotypes), "iso1_.*_death")){
p <- ggdensity(data = df_phenotypes, x = var, fill = "blue") +
theme_bw()
print(p)
}
df_plot <- df_phenotypes %>%
mutate_at(vars(matches("delta_.*_death")),
abs)
for (var in str_subset(colnames(df_plot), "delta_.*_death")){
p <- ggdensity(data = df_plot, x = var, fill = "blue") +
theme_bw()
print(p)
}
# df_cell_death_close_clusters <- df_phenotypes %>%
# arrange(iso2) %>%
# filter(delta_AUC_death < - 40) %>%
# group_by(iso2) %>%
# mutate(iso_cluster = str_c(iso2, "-cluster")) %>%
# ungroup() %>%
# distinct(iso1, .keep_all =T) %>%
# select(iso1, iso2, iso_cluster, delta_AUC_death, iso1_AUC_death, iso2_AUC_death)
df_cell_death_close_clusters <- df_phenotypes %>%
arrange(iso2) %>%
filter(delta_AUC_death < - 40) %>%
group_by(iso2) %>%
mutate(iso_cluster = str_c(iso2, "-cluster"),
n = n()) %>%
select(iso1, iso2, iso_cluster, n, delta_AUC_death, iso1_AUC_death, iso2_AUC_death) %>%
arrange(iso1, n) %>%
group_by(iso1) %>%
mutate(keep = row_number() == 1) %>%
filter(keep)
df_cell_death_close_clusters %>%
.$iso_cluster %>%
n_distinct()
## [1] 13
df_cell_death_close_clusters %>%
ggplot(aes(x = fct_infreq(iso_cluster))) +
geom_bar() +
coord_flip() +
labs(x = "") +
theme_bw()
df_mutations_cell_death_close_clusters <- snippy_data_modified_proteins %>%
right_join(df_cell_death_close_clusters)
## Joining, by = c("iso1", "iso2")
nrow(df_mutations_cell_death_close_clusters)
## [1] 857
df_convergence_cell_death_clusters <- df_mutations_cell_death_close_clusters %>%
drop_na(PRODUCT) %>%
filter(EFFTYPE != "synonymous_variant") %>%
group_by(clstr_prot, clstr_size_prot) %>%
mutate(n_clusters = n_distinct(iso_cluster),
n_pairs = n_distinct(PAIR_ID),
n_references = n_distinct(REFERENCE)) %>%
select(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, GENE, PRODUCT, MUTATION_SHORT, iso_cluster, PAIR_ID, delta_AUC_death, everything()) %>%
arrange(clstr_prot, desc(n_clusters))
t_cell_death_clusters <- df_convergence_cell_death_clusters %>%
group_by(clstr_prot) %>%
mutate(SEQUENCE_prot = SEQUENCE_prot[1]) %>%
distinct(n_references, n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, GENE, PRODUCT, MUTATION_SHORT, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
group_by(n_clusters, n_pairs, clstr_prot, clstr_size_prot, SEQUENCE_prot, nebraska_locus_tag, neb_mutant_id, neb_gene, neb_product) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|"))) %>%
mutate(phenotype = "Cell death (AUC)")
t <- bind_rows(t_mortality_clusters,
t_growth_clusters,
t_cell_death_clusters)
t_convergent <- t %>%
ungroup() %>%
select(clstr_prot, starts_with("neb"), n_clusters, phenotype) %>%
pivot_wider(names_from = phenotype, names_prefix = "n_clusters_", values_from = n_clusters, values_fill = list(n_clusters = 0)) %>%
left_join(t) %>%
group_by_at(vars(clstr_prot, starts_with("n_clusters_"),
starts_with("neb"))) %>%
summarise_at(vars(GENE, PRODUCT, MUTATION_SHORT),
funs(str_c(unique(.), collapse = "|")))
## Joining, by = c("clstr_prot", "nebraska_locus_tag", "neb_mutant_id", "neb_gene", "neb_product")
t_venn <- t_convergent %>%
ungroup() %>%
mutate_at(vars(starts_with("n_clusters")),
funs(mutated = as.integer(. > 0))) %>%
unite(col = "intersect", ends_with("mutated"), sep = "") %>%
mutate(intersect = recode(intersect,
`111` = "all",
`100` = "SAB mortality only",
`110` = "SAB mortality and growth rate",
`101` = "SAB mortality and cell death",
`011` = "Growth rate and cell death",
`010` = "Growth rate only",
`011` = "Growth rate and cell death",
`001` = "Cell death only"))
t_venn_synthesis <- t_venn %>%
count(intersect)
t_venn_stringent <- t_convergent %>%
ungroup() %>%
mutate_at(vars(starts_with("n_clusters")),
funs(mutated = as.integer(. > 1))) %>%
unite(col = "intersect", ends_with("mutated"), sep = "") %>%
filter(intersect != "000") %>%
mutate(intersect = recode(intersect,
`111` = "all",
`100` = "SAB mortality only",
`110` = "SAB mortality and growth rate",
`101` = "SAB mortality and cell death",
`011` = "Growth rate and cell death",
`010` = "Growth rate only",
`011` = "Growth rate and cell death",
`001` = "Cell death only"))
t_venn_stringent_synthesis <- t_venn_stringent %>%
count(intersect)
df_mortality_close_clusters_long <- df_mortality_close_clusters %>%
select(-switches) %>%
ungroup() %>%
pivot_longer(iso1:iso2, names_to = "iso_rank", values_to = "ISOLATE") %>%
distinct() %>%
transmute(ISOLATE, iso_cluster, mortality = if_else(iso_rank == "iso2", "Died", "Survived"))
# Add some metadata
df_genetic_pairs_unique <- df_all_genetic_pairs_pheno %>%
select(starts_with("iso1")) %>%
distinct() %>%
rename(ISOLATE = iso1) %>%
rename_at(vars(starts_with("iso1")),
funs(str_remove(., "iso1_")))
df_mortality_close_clusters_long_with_metadata <- df_mortality_close_clusters_long %>%
left_join(df_genetic_pairs_unique)
## Joining, by = c("ISOLATE", "mortality")
# List of unique isolates
mortality_close_clusters <- sort(unique(df_mortality_close_clusters_long$ISOLATE))
write_lines(mortality_close_clusters, "Ideas_Grant_2020_analysis/Genetic_pairs_table/mortality_close_clusters.txt")
mortality_tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/mortality_close_clusters.tree") %>%
phytools::midpoint.root()
p <- ggtree(mortality_tree, layout = "circular")
metadata_matrix <- df_mortality_close_clusters_long_with_metadata %>%
select(ISOLATE, CC, mortality) %>%
distinct() %>%
column_to_rownames("ISOLATE")
p2 <- gheatmap(p, metadata_matrix %>% select(mortality)) +
scale_fill_manual(values = c("orange", "navy"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p2
library(ggnewscale)
p3 <- p2 + new_scale_fill()
p4 <- gheatmap(p3, metadata_matrix %>%
select(CC), offset = .05) +
scale_fill_viridis_d()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
top_genes <- t_mortality_clusters %>% arrange(desc(n_clusters)) %>% ungroup() %>% filter(row_number() <= 10)
df_top_genes <- df_convergence_mortality_clusters %>%
ungroup() %>%
filter(clstr_prot %in% top_genes$clstr_prot) %>%
distinct(n_clusters, ISOLATE, clstr_prot, GENE, PRODUCT, neb_mutant_id, neb_gene, neb_product, MUTATION_SHORT)
top_genes_matrix <- df_top_genes %>%
distinct(ISOLATE, clstr_prot) %>%
mutate(mutated = T) %>%
pivot_wider(names_from = clstr_prot, values_from = mutated, values_fill = list(mutated = F)) %>%
column_to_rownames("ISOLATE")
p5 <- p4 + new_scale_fill()
p6 <- gheatmap(p5, top_genes_matrix, offset = .2) +
scale_fill_manual(values = c("white", "red"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p6
df_growth_close_clusters_long <- df_growth_close_clusters %>%
ungroup() %>%
select(iso1, iso2, iso_cluster) %>%
pivot_longer(iso1:iso2, names_to = "iso_rank", values_to = "ISOLATE") %>%
distinct(ISOLATE, iso_cluster)
# Add some metadata
df_genetic_pairs_unique <- df_all_genetic_pairs_pheno %>%
select(starts_with("iso1")) %>%
distinct() %>%
rename(ISOLATE = iso1) %>%
rename_at(vars(starts_with("iso1")),
funs(str_remove(., "iso1_")))
df_growth_close_clusters_long_with_metadata <- df_growth_close_clusters_long %>%
left_join(df_genetic_pairs_unique)
## Joining, by = "ISOLATE"
# List of unique isolates
growth_close_clusters <- sort(unique(df_growth_close_clusters_long$ISOLATE))
write_lines(growth_close_clusters, "Ideas_Grant_2020_analysis/Genetic_pairs_table/growth_close_clusters.txt")
growth_tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/growth_close_clusters.tree") %>%
phytools::midpoint.root()
p <- ggtree(growth_tree, layout = "circular")
metadata_matrix <- df_growth_close_clusters_long_with_metadata %>%
select(ISOLATE, CC, AUC_OD) %>%
distinct() %>%
column_to_rownames("ISOLATE")
p2 <- gheatmap(p, metadata_matrix %>% select(AUC_OD)) +
scale_fill_viridis_c()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p2
library(ggnewscale)
# p3 <- p2 + new_scale_fill()
# p4 <- gheatmap(p3, metadata_matrix %>%
# select(CC), offset = .2) +
# scale_fill_viridis_d()
# p4
top_genes <- t_growth_clusters %>% arrange(desc(n_clusters)) %>% ungroup() %>% filter(row_number() <= 10)
df_top_genes <- df_convergence_growth_clusters %>%
ungroup() %>%
filter(clstr_prot %in% top_genes$clstr_prot) %>%
distinct(n_clusters, ISOLATE, clstr_prot, GENE, PRODUCT, neb_mutant_id, neb_gene, neb_product, MUTATION_SHORT)
top_genes_matrix <- df_top_genes %>%
distinct(ISOLATE, clstr_prot) %>%
mutate(mutated = T) %>%
pivot_wider(names_from = clstr_prot, values_from = mutated, values_fill = list(mutated = F)) %>%
column_to_rownames("ISOLATE")
p5 <- p2 + new_scale_fill()
p6 <- gheatmap(p5, top_genes_matrix, offset = .2) +
scale_fill_manual(values = c("white", "red"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p6
df_cell_death_close_clusters_long <- df_cell_death_close_clusters %>%
ungroup() %>%
select(iso1, iso2, iso_cluster) %>%
pivot_longer(iso1:iso2, names_to = "iso_rank", values_to = "ISOLATE") %>%
distinct(ISOLATE, iso_cluster)
# Add some metadata
df_cell_death_close_clusters_long_with_metadata <- df_cell_death_close_clusters_long %>%
left_join(df_genetic_pairs_unique)
## Joining, by = "ISOLATE"
# List of unique isolates
cell_death_close_clusters <- sort(unique(df_growth_close_clusters_long$ISOLATE))
write_lines(cell_death_close_clusters, "Ideas_Grant_2020_analysis/Genetic_pairs_table/cell_death_close_clusters.txt")
growth_tree <- read.tree("Ideas_Grant_2020_analysis/Raw_data/cell_death_close_clusters.tree") %>%
phytools::midpoint.root()
p <- ggtree(growth_tree, layout = "circular")
metadata_matrix <- df_cell_death_close_clusters_long_with_metadata %>%
select(ISOLATE, CC, AUC_death) %>%
distinct() %>%
column_to_rownames("ISOLATE")
p2 <- gheatmap(p, metadata_matrix %>% select(AUC_death)) +
scale_fill_viridis_c()
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p2
library(ggnewscale)
# p3 <- p2 + new_scale_fill()
# p4 <- gheatmap(p3, metadata_matrix %>%
# select(CC), offset = .2) +
# scale_fill_viridis_d()
# p4
top_genes <- t_cell_death_clusters %>% arrange(desc(n_clusters)) %>% ungroup() %>% filter(row_number() <= 10)
df_top_genes <- df_convergence_cell_death_clusters %>%
ungroup() %>%
filter(clstr_prot %in% top_genes$clstr_prot) %>%
distinct(n_clusters, ISOLATE, clstr_prot, GENE, PRODUCT, neb_mutant_id, neb_gene, neb_product, MUTATION_SHORT)
top_genes_matrix <- df_top_genes %>%
distinct(ISOLATE, clstr_prot) %>%
mutate(mutated = T) %>%
pivot_wider(names_from = clstr_prot, values_from = mutated, values_fill = list(mutated = F)) %>%
column_to_rownames("ISOLATE")
p5 <- p2 + new_scale_fill()
p6 <- gheatmap(p5, top_genes_matrix, offset = .2) +
scale_fill_manual(values = c("white", "red"))
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
p6